---
title: "Topic model"
author: "Katrina Keegan"
date: "10/6/2021"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(topicmodels)
library(tidytext)
library(ggraph)
library(extrafont)
```

```{r, warning = FALSE}
# Build the document-term matrix (DTM) that the topic model is fitted on.

# Change the file name (the file must be in the folder where this code is).
tokenized_texts <- read.csv(
  "TOKENS_sputnik_rs_vaccine.csv",
  encoding = "UTF-8"
)

# Tokens that are removed before modelling. Add further tokens here as needed.
# NOTE(review): "već" used to be filtered twice in a row. It is listed once
# here; if the second copy was a different Unicode form of the same word,
# add that form to this vector.
removed_tokens <- c("moći", "reći", "kazati", "zbog", "već")

# One row per (index, token) pair with the number of times it occurs.
# `token != ""` also drops rows whose token is NA, because filter() keeps only
# rows where the whole condition is TRUE.
counted_tokenized_texts <- tokenized_texts %>%
  select(token, index) %>%
  filter(token != "", !token %in% removed_tokens) %>%
  group_by(index, token) %>%
  summarize(count = n(), .groups = "drop")

# Cast the long counts into a DTM: documents = index, terms = token,
# cell values = count.
dtm <- counted_tokenized_texts %>%
  cast_dtm(index, token, count)

```


```{r}
# Pick an LDA model
#
# Workflow:
#   1. Fit the model for a few values of k (the number of topics) and run the
#      following code chunk on each fit to see which one makes the most sense.
#   2. Then try a few different seeds to see whether randomness produces a
#      model that is more sensible than the others.
#   3. Keep all the earlier fits: fitting takes a long time on big data and
#      you don't want to have to rerun it if possible.
#
# LDA() is the function from the topicmodels package that actually performs
# the topic modelling. control = list(seed = ...) is supposed to make sure the
# randomized output is the same each time.

lda_2 <- LDA(
  x = dtm,
  k = 2,
  control = list(seed = 48)
)
lda_3 <- LDA(
  x = dtm,
  k = 3,
  control = list(seed = 48)
)
lda_4 <- LDA(
  x = dtm,
  k = 4,
  control = list(seed = 48)
)
lda_5 <- LDA(
  x = dtm,
  k = 5,
  control = list(seed = 48)
)
```


```{r}
# View the words in each topic

# tidy() from tidytext turns the fitted topic model into a tibble with one row
# per (topic, term) pair and that pair's beta value.
# Swap lda_2 for another fitted model to inspect a different number of topics.
tidy_lda <- tidy(lda_2)

# Keep the highest-beta terms of every topic.
# NOTE: the object is called top_20 because later chunks refer to it by that
# name, but n = 30 terms per topic are kept (plus any ties at the cut-off).
#
# Every topic is labelled "topic <number>", whatever number of topics the
# model has. To give a topic its actual name, recode the label afterwards, e.g.
#   mutate(topic = recode(topic, "topic 1" = "my topic name"))
# The labels are built before group_by() so that the grouping column never has
# to be modified; the result is still grouped by topic.
top_20 <- tidy_lda %>%
  filter(term != "reci") %>%
  mutate(topic = paste("topic", topic)) %>%
  group_by(topic) %>%
  slice_max(beta, n = 30)
```


```{r}
# Add translations

```


```{r}
# Bar chart of the key words of every topic

top_20 %>%
  # Adapted from
  # https://www.tidytextmining.com/nasa.html#interpreting-the-topic-model
  # reorder_within() orders the terms by beta inside each topic, so every
  # facet runs from high beta to low beta instead of following one overall
  # beta ranking.
  mutate(term = reorder_within(term, beta, topic)) %>%
  # arrange() ignores grouping, so dropping the groups first gives the same
  # row order as sorting a grouped table and ungrouping afterwards.
  ungroup() %>%
  arrange(desc(beta)) %>%
  ggplot(aes(beta, term, fill = topic)) +
  geom_col(show.legend = FALSE) +
  # Strips the suffix that reorder_within() appended to the term labels.
  scale_y_reordered() +
  scale_x_continuous(labels = scales::percent, n.breaks = 4) +
  # scales = "free": every topic gets its own y axis (its own terms) rather
  # than sharing one axis across all panels.
  facet_wrap(vars(topic), ncol = 3, scales = "free") +
  # Optional fixed palette:
  # scale_fill_manual(values = c("#3A6E8B", "#64BAAC", "#777877")) +
  labs(
    title = "",
    x = "Percent of the topic comprised of the word",
    y = ""
  ) +
  theme_bw() +
  theme(axis.title = element_text(size = 8))
  

```


```{r}
# Find the main topic of each article and the share of articles per topic

# matrix = "gamma" makes tidy() return one row per (document, topic) pair with
# that pair's gamma value.
# The model must be the same one inspected in the key-word chunk (lda_2); the
# previously referenced `desc_lda` is not defined anywhere in this document.
# Topics are labelled "topic <number>" for however many topics the model has.
gamma <- tidy(lda_2, matrix = "gamma") %>%
  mutate(topic = paste("topic", topic))

# For every document keep the single topic with the highest gamma
# (with_ties = FALSE, so each article is counted exactly once even when two
# topics tie), then count the articles per topic and turn the counts into
# percentages and printable labels.
main_topic <- gamma %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  count(topic, name = "number_articles") %>%
  mutate(
    percent = round(number_articles / sum(number_articles), 3),
    labels = scales::percent(percent)
  )

```


```{r}
# Pie chart of the share of articles whose main topic is each topic.
# Recipe: https://r-charts.com/part-whole/pie-chart-percentages-ggplot2/

main_topic %>%
  ggplot(aes("", percent, fill = topic)) +
  # One stacked bar; coord_polar() below bends it into a pie.
  geom_col() +
  # Percentage label in the middle of every slice.
  geom_text(
    aes(label = labels),
    size = 8,
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar(theta = "y") +
  labs(title = "", fill = "") +
  theme_void() +
  theme(
    legend.position = "right",
    legend.text = element_text(size = 20)
  )
# Optional fixed palette: append the following to the plot above
#   + scale_fill_manual(values = c("#3A6E8B", "#64BAAC", "#777877"))
```



